import requests
import csv
from bs4 import BeautifulSoup
import re

def find_en(st, text):
    """Return every index at which *st* occurs in *text*, terminated by -1.

    The returned list is the ascending occurrence positions followed by a
    single -1 sentinel (search-exhausted marker), e.g. ``[1, 3, 5, -1]``.

    Bug fix: the original searched the module-level global ``body`` instead
    of the ``text`` parameter, so the function only worked by accident when
    ``body`` happened to hold the intended string; it also capped the scan
    at 1000 matches, silently truncating the result.
    """
    position = []
    n = 0
    while True:
        pos = text.find(st, n, len(text))
        position.append(pos)
        if pos == -1:
            break
        # resume just past this hit so overlapping matches are still found
        n = pos + 1
    return position

# Merge full-width-colon QA positions into the half-width-colon position list.
def correct(datalist1, datalist2):
    """Insert each value of ``datalist1`` (its trailing -1 sentinel excluded)
    into ``datalist2`` just before the first larger value, keeping
    ``datalist2`` sorted in place. ``datalist2``'s own trailing sentinel is
    never considered as an insertion point. Mutates ``datalist2``; returns
    None. A value with no larger element ahead of the sentinel is skipped.
    """
    for value in datalist1[:-1]:
        # scan up to (but not including) the sentinel slot of datalist2
        for idx in range(len(datalist2) - 1):
            if datalist2[idx] > value:
                datalist2.insert(idx, value)
                break

# Strip trailing sub-headings that may have leaked into an answer body.
def delete(list1):
    """Truncate each string in ``list1`` (in place) at the first run of four
    newlines, keeping exactly one newline at the cut point. Strings without
    such a run are left untouched; returns None.
    """
    for idx, text in enumerate(list1):
        cut = text.find('\n\n\n\n')
        if cut == -1:
            continue
        list1[idx] = text[:cut + 1]

if __name__ == "__main__":
    # Page URLs and their chapter names; the two lists are index-aligned.
    #url_ipynb = ['https://www.mindspore.cn/mindinsight/docs/zh-CN/r1.7/training_visual_design.html']
    url_html = ['https://www.mindspore.cn/docs/zh-CN/r1.7/migration_guide/overview.html', 'https://www.mindspore.cn/docs/zh-CN/r1.7/migration_guide/preparation.html', 'https://www.mindspore.cn/docs/zh-CN/r1.7/migration_guide/script_analysis.html', \
    'https://www.mindspore.cn/mindinsight/docs/zh-CN/r1.7/migrate_3rd_scripts_mindconverter.html', 'https://www.mindspore.cn/docs/zh-CN/r1.7/migration_guide/migration_script.html', \
    'https://www.mindspore.cn/docs/zh-CN/r1.7/migration_guide/training_process_comparision.html', 'https://www.mindspore.cn/docs/zh-CN/r1.7/migration_guide/typical_api_comparision.html', \
    'https://www.mindspore.cn/docs/zh-CN/r1.7/migration_guide/optim.html', 'https://www.mindspore.cn/docs/zh-CN/r1.7/migration_guide/use_third_party_op.html', \
    'https://www.mindspore.cn/docs/zh-CN/r1.7/migration_guide/neural_network_debug.html', 'https://www.mindspore.cn/mindinsight/docs/zh-CN/r1.7/accuracy_problem_preliminary_location.html', 'https://www.mindspore.cn/mindinsight/docs/zh-CN/r1.7/accuracy_optimization.html', \
    'https://www.mindspore.cn/docs/zh-CN/r1.7/migration_guide/performance_optimization.html', 'https://www.mindspore.cn/docs/zh-CN/r1.7/migration_guide/inference.html', \
    'https://www.mindspore.cn/docs/zh-CN/r1.7/migration_guide/sample_code.html', 'https://www.mindspore.cn/docs/zh-CN/r1.7/migration_guide/faq.html']

    process = ['概述', '准备工作', '网络脚本分析', '使用MindConverter迁移模型定义脚本', '迁移脚本', '基本执行流程横向对比', '典型算子或接口区别介绍', '优化器迁移指南', '基于自定义算子接口调用第三方算子库', \
    '网络调试', '精度问题初步定位指南', '精度问题详细定位和调优指南', '性能调试', '推理执行', '网络迁移调试实例', '常见问题']

    # Keyword tags to be matched (case-insensitively) against each section.
    tag = ['白皮书', 'MindSpore', '全场景', '自动微分', 'GradOperation', 'MindSpore', 'Parameter Server', \
           '并行', '优化器', 'Host&Device', '分布式', '算子', '接口', '数据集', '集合通信', '同步模式', '损失函数', '控制流', \
           '自由变量', '闭包', 'MindRecord', 'Pipeline', 'Adaptor', 'Optimizer', 'Runtime', 'Operators', 'Callback', \
           'MindSpore AKG', 'Auto-Tiling', 'Auto-Mapping', 'Davinci', '样例脚本', 'GPU', 'CPU', 'CuBLAS库', 'Linux', 'Ascend', \
           '模型推理', 'MindInsight', 'plugin_name', 'MindArmour', 'AI Fuzzer', 'MINDIR', '缩略语', \
           '数据类型', '原型', '运算符', '语句', '内置函数', '网络', \
           'PyNative', 'MindConverter', '评估', 'TensorFlow', 'Pytorch', 'ONNX', 'PB', 'API', 'Model', 'SGD', 'loss', 'metric', 'ARM', 'AST', \
           'TensorBoard', 'Dataloader', 'GeneratorDataset', ' GradOperation', 'TrainOneStepCell', '反向传播', '求导', 'weight decay', 'LR', '学习率', \
           'Aten', '调试', 'MindOptimizer', '精度', 'checklist', '数据', 'loss scale', '超参', '计算图', '精度', '可视化', '模型结构', \
           '优化', '迭代', 'ModelZoo', 'MindSpore Serving', '脚本', '单机', 'profiling', 'MindData', '框架', '性能', 'Dataset', 'macOS', 'SDK版本', 'SciPy',\
           'whl包', 'protobuf', 'Ubuntu', 'Windows', 'WSL', 'Conda', 'Serving', 'gmp', 'cuda', 'JupyterLab',  'eval', 'NLP', 'HCLL', 'ModelArts', 'Graph', \
           'C++', 'AIPP', 'OpenMPI', 'NCLL', 'RDMA', 'IB', 'RoCE', 'taichi', 'Caffe', 'NPU', 'Embedding', 'Parameter Server', 'JIT Fallback']
    # Escape regex metacharacters so every tag is matched literally.
    tag_change = [re.escape(i) for i in tag]
    Dataset = []
    # Main loop: one iteration per documentation page.
    for numbers in range(0, len(process)):
        # Fetch the page and parse it.
        req = requests.get(url=url_html[numbers])
        req.encoding = 'utf-8'
        html = req.text
        bs = BeautifulSoup(html, "html.parser")  # html.parser is the parser backend
        All_title = bs.find_all("div", attrs={"class": "section"})  # every heading section

        # Per-page accumulators: heading names and their extracted text.
        pre_title_2 = []
        pre_title_1 = []
        title = []
        content = []
        # Collect the h1 (mapped to the chapter name) and all h2 headings.
        for one_title in All_title:
            h_1 = one_title.find('h1')
            h_2 = one_title.find('h2')
            if h_1 is not None:
                pre_title_1.append(process[numbers].lower())  # first-level title name
            if h_1 is None and h_2 is not None:
                title_2 = h_2.get_text()  # second-level title name
                pre_title_2.append(title_2)
        # Extract the text between the first-level heading and the first
        # second-level heading. '¶' is the permalink marker appended to
        # every heading, so its first position marks the end of the h1.
        result = bs.select("#" + pre_title_1[0])
        body = result[0].get_text()
        title_1_location = find_en("¶", body)
        # Page has h2 headings, e.g. "二阶优化器THOR介绍" or "训练可视总体设计"
        if len(pre_title_2) > 0:
            title_2_location = find_en(pre_title_2[0], body)
            body_1 = body[title_1_location[0] + 1 : title_2_location[0]]
            # Keep the intro only when there is real content between h1 and h2.
            if len(body_1) > 20:
                title.append(pre_title_1[0] + "§" + pre_title_1[0])
                content.append(body_1)
        # Page has no h2 headings, e.g. "全场景统一" or "术语"
        else:
            body_1 = body[title_1_location[0] + 1: ]
            title.append(pre_title_1[0] + "§" + pre_title_1[0])
            content.append(body_1)

        # Extract each second-level heading and its content.
        for title_2 in pre_title_2:
            title_mis = title_2.rstrip('¶')
            # Anchor ids drop '&' and replace spaces with '-', lower-cased.
            if "&" in title_mis:
                title_mis = title_mis.replace("&", "")
            if " " in title_mis:
                title_mis = title_mis.replace(" ", "-")
            title_mis = title_mis.lower()
            result = bs.select("#" + title_mis)
            body = result[0].get_text()
            title_2_location = find_en(title_2, body)
            body_1 = body[title_2_location[0] + len(title_2):]
            body_1 = body_1.replace('¶', '')
            title_2 = title_2.rstrip('¶')
            title.append(pre_title_1[0] + "§" + title_2)
            content.append(body_1)

        # Match tags against every section of this page.
        # Bug fix: this block used to sit INSIDE the h2 loop above, which
        # (a) redid all matching on every iteration and (b) left FinalTag
        # undefined for pages without h2 headings, raising NameError in the
        # row-assembly loop below. It now runs exactly once per page.
        tag1 = []
        for m in range(0, len(title)):
            str8 = title[m] + content[m]
            tag2 = []
            for i in tag_change:
                t = re.search(i, str8, flags=re.IGNORECASE)
                if t is not None:
                    tag2.append(t.group(0))
            tag1.append(tag2)
        # Join each section's tag list into one '、'-separated string.
        FinalTag = ['、'.join(matched) for matched in tag1]

        # Assemble one csv row per section.
        for n in range(0, len(title)):
            Dataset.append([title[n], url_html[numbers], content[n], process[numbers], FinalTag[n]])
    # Write the collected rows to csv.
    headers = ('title', 'link', 'content', 'process', 'tag')  #
    with open('html_migration_dataset.csv', 'w', encoding='utf-8', newline='') as f:
        write = csv.writer(f)  # create the writer object
        write.writerow(headers)
        for n in range(len(Dataset)):
            write.writerow(Dataset[n])
